# 06_part_ii_clustering
# Identifies clusters based on distance matrix 
# Produces:
# - Figure S3: Average silhouette width for 2- to 10-cluster solutions
# - Table S9: Cluster Quality Metrics
# - Main Table 4: Six Clusters of Prioritization Policies
# - Main Table 5: Characteristics of Housing Authorities in 
#                 Prioritization Policy Clusters
# - Table A2. Cluster Medoids

library(tidyverse)
library(igraph)
library(magrittr)
library(here)
options(scipen=999)

# Load coded preference data
codes <- readRDS(here("data", "codes_expanded.RDS"))
# Load data on PHAs in sample
pha_pref_ref <- readRDS(here("data", "pha_pref_ref.RDS"))
# Load all edges 
all_edges <- readRDS(here("data", "intermediate", "all_edges.RDS"))
# Load data on PHA characteristics
pha_all <-  readRDS(here("data", "pha_all.RDS")) %>%
  # Create variable for percent of all vouchers administered
  # by given PHA
  mutate(hud_perc_vouchers = hud_med_month_vouchers / sum(hud_med_month_vouchers) * 100)

# Load all distance matrixes
filepath <- here("data","intermediate")

# Retrieve distance matrixes calculated in 05_
get_dist_mat <- function(file, measure) {
  dist_mat <- readRDS(file)
  if (measure != "dc") {
    dist_mat <- dist_mat$D
  }
  return(dist_mat)
}

# Assemble distance matrixes in a dataframe
# Used for comparing distributions 
dist_mats <- data.frame(fns = list.files(filepath)) %>%
  filter(str_detect(fns, "pagerank")) %>%
  separate(fns, into = c("dist", "measure", "imp", "imp_weight", 
                         "exp", "exp_weight", "damp", "damp_factor"), 
           remove = FALSE, extra = "drop") %>%
  mutate_at(vars(ends_with("_weight")), as.numeric) %>%
  dplyr::select(fns, measure, imp_weight, exp_weight, damp_factor) %>%
  mutate(mat_label = paste(measure, imp_weight, exp_weight, damp_factor, 
                           sep = "_")) %>%
  group_by(mat_label) %>%
  mutate(dist_mat = map(paste0(filepath, "/", fns), get_dist_mat, measure)) %>%
  mutate(dist_num = map(dist_mat, as.numeric)) 

dist_df <- dist_mats %>%
  dplyr::select(-dist_mat) %>%
  unnest(cols = c("dist_num")) 

############################################
## Create summary of preferences by PHA   ## 
## for use later to characterize clusters ##
############################################

# Create df with all PHAs in sample and their 
# preferences, including all PHAs with no preferences
abt_rows <- pha_pref_ref %>%
  filter(any_preferences == 0 & from_abt == TRUE) %>%
  dplyr::select(pha_code)

coded_prefs <- codes %>%
  dplyr::select(pha_code:special_defs) %>%
  # Add rows for Abt entries
  bind_rows(abt_rows) %>%
  # Zero out rows with no preferences from Abt additions
  mutate_at(vars(unit_single:special_defs), list(~ifelse(is.na(.), 0, .))) 

# Get summary of the preferences used by each PHA for later analysis 
pref_summary <- coded_prefs %>%
  dplyr::select(-order) %>%
  group_by(pha_code) %>%
  summarize_all(funs(sum)) %>%
  arrange(pha_code)

# Binary version of use at all of the above
pref_summary_bin <- coded_prefs %>%
  dplyr::select(-order) %>%
  group_by(pha_code) %>%
  summarize_all(funs(max)) %>%
  arrange(pha_code)

pha_codes_sorted <- pref_summary$pha_code
# Sort PHA names in order of their PHA code
pha_names_sorted <- pha_all %>% 
  semi_join(filter(pha_pref_ref, in_sample_abt), by = "pha_code") %>% 
  arrange(pha_code) %>% 
  pull(pha_name)

################
## CLUSTERING ##
################

library(cluster)

# Select distance matrix
row_index <- which(dist_mats$mat_label == "pagerank_25_100_85")
input_dist_mat <- dist_mats$dist_mat[[row_index]]

# Set possible K's to test 
k_values <- 2:10

# Function to generate pam solution, given the dist_mat, k and 
get_pam_solution <- function(k, in_dist) {
  pam_obj <- pam(in_dist, k = k, diss = TRUE, keep.diss = FALSE, 
                 keep.data = FALSE)
  return(pam_obj)
}

# Get PAM solutions based on dist mat from PageRank with damping factor of 0.85
# Breaking out into two steps just to save time on computing 
pam_85_init <- data.frame(k = k_values,
                          damp_factor = 85) %>%
  group_by(k, damp_factor) %>%
  mutate(pam_obj = map(k, get_pam_solution, 
                       dist_mats$dist_mat[[which(dist_mats$mat_label == 
                                                   "pagerank_25_100_85")]])) 

####################################
# Evaluate cluster solution quality 
# across different values of k
####################################

# Calculate generalized silhouette width 
# Lengyel, A. and Z. Botta-Dukat. 2019. Silhouette width using generalized mean.
# Functions to return avg generalized silhouette width
# Calculate average silhouette width, cluster average silhouette width
# and proportion positive for selected cluster solution 
library(optpart)

set_ps <- c(-3, -1, 0, 1)
calc_avg_sil <- function(set_p, pam_obj) {  
  general_sw <- optpart::gensilwidth(pam_obj, dist = input_dist_mat, p = set_p)
  return(mean(general_sw[,'sil_width']))
} 

# Calculate cluster average silhouette  
calc_clusavg_sil <- function(set_p, pam_obj) {  
  general_sw <- optpart::gensilwidth(pam_obj, dist = input_dist_mat, p = set_p)
  return(mean(summary(general_sw)$clus.avg.widths))
} 

# Calculate proportion of silhouette positive 
calc_proppos_sil <- function(set_p, pam_obj) {  
  general_sw <- optpart::gensilwidth(pam_obj, dist = input_dist_mat, p = set_p)
  return(mean(general_sw[,'sil_width'] > 0))
} 

pam_85 <- pam_85_init %>%
  # Generate rows for all p values
  expand(set_ps) %>%
  left_join(pam_85_init, by = c("k", "damp_factor")) %>%
  # Retrieve specific attributes of silhouette solution 
  # Calculate average silhouette width
  mutate(avg_sil_width = map2_dbl(set_ps, pam_obj, calc_avg_sil),
         clusavg_sil_width = map2_dbl(set_ps, pam_obj, calc_clusavg_sil),
         prop_sil_pos = map2_dbl(set_ps, pam_obj, calc_proppos_sil))
                             
# Produce Figure S3: Average silhouette width for 2- to 10-cluster solutions
# (ONLINE SUPPLEMENT)
# Produce plots of silhouette-based metric values by k
library(scales)
ggplot(pam_85, aes(x = k, y = avg_sil_width, group = set_ps)) +
  geom_line(aes(color = as.factor(set_ps))) +
  geom_point(aes(color = as.factor(set_ps))) + 
  xlab("K") + ylab("Average silhouette width") + 
  guides(color=guide_legend(title="p")) +
  theme_minimal() + 
  scale_x_continuous(breaks= pretty_breaks())

ggsave(here("figs", "sil_k.png"), plot = last_plot(),
       device = "png", width = 10, height = 9,
       dpi = 300)

# Produce Table S9: Cluster Quality Metrics
# (ONLINE SUPPLEMENT)
set_k <- 6
sil_df <- pam_85 %>%
  ungroup() %>%
  filter(k == set_k) %>%
  dplyr::select(set_ps, avg_sil_width, clusavg_sil_width, prop_sil_pos) %>%
  gather(key = "key", value = "value", -set_ps) %>%
  spread(key = set_ps, value = "value")
print(xtable::xtable(sil_df), include.rownames  = FALSE)

##########################################################
# Examine what the different cluster solutions look like
# across different k (Online Supplement)
##########################################################

# Summarize prioritization policy depth and complexity 
# for observations with preferences
codes_summary <- codes %>%
  filter(any_preferences == 1) %>%
  group_by(pha_code) %>%
  # Operationalize depth as number of distinct levels of hierarchy
  summarize(depth = max(order),
            # Operationalize complexity as number of distinct rows of preferences
            complexity = n()) 

# Bind with rows for housing authorities with no preferences
# where depth and complexity are set to 0
elaboration_summary <- pha_pref_ref %>%
  filter(any_preferences == 0) %>%
  transmute(pha_code,
            depth = 0,
            complexity = 0) %>%
  bind_rows(codes_summary) %>%
  arrange(pha_code)

# Function to retrieve median levels of priority
# prioritization policies feature
get_depth <- function(pam_obj) {
  depth_out <- elaboration_summary %>%
    mutate(cluster = pam_obj$clustering) %>%
    group_by(cluster) %>%
    summarize(med_depth = median(depth)) %>%
    arrange(cluster)
  return(depth_out$med_depth)
}

# Function to retrieve median number of preferences
# prioritization policies feature
get_complexity <- function(pam_obj) {
  complexity_out <- elaboration_summary %>%
    mutate(cluster = pam_obj$clustering) %>%
    group_by(cluster) %>%
    summarize(med_complex = median(complexity)) %>%
    arrange(cluster)
  return(complexity_out$med_complex)
}

# Characterize cluster based on the proportion of PHAs in each cluster
# that use each category
get_pref_use_summary <- function(pam_obj, pref_summary_in) {
  use_summary <- pref_summary_in %>%
    mutate(pam_clust_num = pam_obj$clustering) %>%
    group_by(pam_clust_num) %>%
    summarize_if(is.numeric, mean) 
  return(use_summary)
} 

# Calculate LIFT, Adapted from stm package 
# First argument should be the calculated pref_use_prop
calc_lift <- function (pref_use_p, catcounts) {
  logbeta <- as.matrix(log(pref_use_p[,-1]))
  emp.prob <- log(catcounts) - log(sum(catcounts))
  lift <- logbeta - rep(emp.prob, each = nrow(logbeta))
  #t(apply(lift, 1, order, decreasing = TRUE))
  return(lift)
}

# Set up for various inputs to be used 
# Calculate overall number of times each category is used all plans 
cat_counts <- pref_summary %>% 
  dplyr::select(-pha_code) %>% 
  summarize_all(sum) %>%
  as.numeric

cats <- colnames(pref_summary[,-1])

# Create version of proportion where it's deviation from the average to see what's identifiable 
pref_summary_avg <- pref_summary_bin %>%
  summarize_if(is.numeric, mean) %>%
  as.numeric()

pam_85_summary <- pam_85_init %>%
  # Get medoids 
  mutate(medoid = map(pam_obj, function(x) pha_codes_sorted[x$medoids]),
         medoid_name = map(pam_obj, function(x) pha_names_sorted[x$medoids]),
         # get cluster sizes 
         sizes = map(pam_obj, function(x) x$clusinfo[,"size"]),
         med_depth = map(pam_obj, get_depth),
         med_complex = map(pam_obj, get_complexity),
         pref_use_count =  map(pam_obj, get_pref_use_summary, pref_summary),
         pref_use_prop =  map(pam_obj, get_pref_use_summary, pref_summary_bin)
         ) %>%
  mutate(lift = map(pref_use_prop, calc_lift, cat_counts)) 


pam_85_summary %>%
  dplyr::select(medoid, medoid_name, pref_use_prop) %>%
  unnest(cols = c(medoid, medoid_name, pref_use_prop)) %>%
  mutate(cluster_num = row_number()) 

###########################################
## Summarize selected 6 cluster solution ##
###########################################

# Load PageRank of different categories in each prioritization policy
pageranks <- readRDS(here("data", "intermediate", 
                          dist_mats[row_index,]$fns))$features 

# Set column names
colnames(pageranks) <- readRDS(here("data", "intermediate", 
                                    "pr_colnames__imp_25_exp_100_damp_85.RDS"))

# Select solution for K=6
pam_85_k6 <- pam_85_init[pam_85_init$k == set_k,]$pam_obj

# Map for manually reordering the 6 cluster solution
# by elaboration
cluster_map <- data.frame(old_cluster = 1:set_k, 
                          new_cluster = c(5,6,1,2,4,3))

# Get cluster assignments
clusters_original <- pam_85_k6[[1]]$clustering
cluster_medoids_original <- pam_85_summary[pam_85_summary$k == set_k,]$medoid[[1]]

# Create dataframe of PageRanks by housing authority
pref_pr <- pref_summary %>% 
  dplyr::select(pha_code) %>%
  mutate(old_cluster = clusters_original) %>%
  left_join(cluster_map, by = "old_cluster") %>%
  bind_cols(as.data.frame(pageranks)) 

# Create tidy version
pref_pr_tidy <- pref_pr %>%
  pivot_longer(ami_30to40:working, names_to = "category", values_to = "pagerank")

# Produce Main Table 4 
# Create summary of median PageRank by cluster
pr_summary_med <- pref_pr_tidy %>%
  group_by(new_cluster, category) %>%
  summarize(median_pagerank = median(pagerank)) %>%
  mutate(new_colname = paste0("cl", new_cluster, "_median_pagerank")) %>%
  ungroup() %>%
  dplyr::select(-new_cluster) %>%
  pivot_wider(id_cols = category, names_from = new_colname, values_from = median_pagerank) %>%
  mutate(cl1_rank = dense_rank(-cl1_median_pagerank),
         cl2_rank = dense_rank(-cl2_median_pagerank),
         cl3_rank = dense_rank(-cl3_median_pagerank),
         cl4_rank = dense_rank(-cl4_median_pagerank),
         cl5_rank = dense_rank(-cl5_median_pagerank),
         cl6_rank = dense_rank(-cl6_median_pagerank))

# Summary stats on prioritization policy clusters 
# under reassigned cluster numbers
cluster_summary_short <- data.frame(old_cluster = 1:set_k,
                                    size = pam_85_summary[pam_85_summary$k == set_k,]$sizes[[1]],
                                    med_levels = pam_85_summary[pam_85_summary$k == set_k,]$med_depth[[1]],
                                    med_unique_pref = pam_85_summary[pam_85_summary$k == set_k,]$med_complex[[1]]) %>%
  mutate(perc = 100 * size / sum(size)) %>%
  left_join(cluster_map, by = "old_cluster") %>%
  mutate(cluster_name = paste0("C", new_cluster)) %>%
  dplyr::select(cluster_name, med_levels, med_unique_pref, perc) %>%
  arrange(cluster_name)


###########################################
# Get characteristics of PHAs in clusters #
###########################################
# Produce Main Table 5

# Merge cluster assignments with PHA characteristics 
pha_character <-  pref_summary %>% 
  dplyr::select(pha_code) %>%
  mutate(old_cluster = clusters_original) %>%
  left_join(cluster_map, by = "old_cluster") %>%
  left_join(pha_all, by = "pha_code") %>%
  rename(cluster = new_cluster)

# Generate descriptives for PHAs in given df 
# Expects dataframe corresponding to the PHAs in a given cluster
generate_full_desc <- function(samp_df) {
  # Get sum of % of vouchers covered by PHAs in each cluster
  descriptives_1a <- samp_df %>%
    summarize(n = n(),
              perc_vouchers_administered = sum(hud_perc_vouchers) / 100)
  
  # Descriptives for continuous variables
  descriptives_1b <- samp_df %>%
    transmute(hud_med_month_vouchers, 
              hud_count_all_units,
              derived_acs_not_hispanic_or_latinowhite_alone_percent, 
              derived_acs_not_hispanic_or_latinoblack_or_african_american_alone_percent, 
              derived_acs_not_hispanic_or_latinoasian_alone_percent,
              derived_acs_hisp_any_perc, derived_acs_veteran_percent, 
              derived_acs_below_100_percent_of_the_poverty_level_percent, 
              derived_acs_in_the_labor_forceunemployed_percent,
              derived_acs_living_in_household_with_supplemental_security_income_ssi_cash_public_assistance_income_or_food_stampssnap_in_the_past_12_months_percent,
              derived_median_hh_income, derived_median_rental_burden,
              derived_acs_renter_occupied_percent, derived_acs_vacant_percent,
              perc_tracts_urban, 
              urban_affordable_noassist_per100, 
              urban_affordable_per100, 
              sharkey_commorg_densper_1000_inpov, 
              human_services_densper_1000_inpov, 
              repub_2016) %>%
    summarize_all(list(~median(., na.rm = TRUE)))
  
  # Descriptives for categorical variables 
  descriptives_2 <- samp_df %>%
    group_by(coc_category_noimpute) %>%
    summarize(count = n()) %>%
    ungroup() %>%
    mutate(prop = (count / sum(count)) * 100) %>%
    dplyr::select(coc_category_noimpute, prop) %>%
    rename(variable = coc_category_noimpute,
           value = prop) %>%
    filter(variable == "Local COC")
  
  descriptives_3 <- samp_df %>%
    group_by(census_region) %>%
    summarize(count = n()) %>%
    ungroup() %>%
    mutate(prop = (count / sum(count)) * 100) %>%
    dplyr::select(census_region, prop) %>%
    transmute(variable = census_region,
              value = prop) 
  
  descriptives <- bind_cols(descriptives_1a, descriptives_1b) %>%
    pivot_longer(everything(), names_to = "variable") %>%
    bind_rows(descriptives_2,
              descriptives_3) %>%
    filter(!is.na(variable))
  
  return(descriptives)
}

# Generate summaries by cluster 
cluster_character_full <- pha_character %>%
  group_by(cluster) %>%
  nest() %>%
  mutate(desc_1 = purrr::map(data, generate_full_desc)) %>%
  dplyr::select(-data) %>%
  unnest(cols = c(desc_1)) %>%
  mutate(cluster_name = paste0("C", cluster)) %>%
  ungroup() %>%
  dplyr::select(-cluster) %>%
  pivot_wider(names_from = cluster_name, values_from = value) %>%
  mutate_at(vars(starts_with("C")), 
            funs(ifelse(str_detect(variable, 
                                   "_percent$|_perc|perc_|^repub"), . * 100, .))) %>%
  dplyr::select(variable, paste0("C", 1:set_k))

cluster_character_labeled <- cluster_character_full  %>%
  mutate(Variable_label = c("Observations",
                            "% of vouchers administered in cluster",
                            "Vouchers administered", 
                            "Total units served",
                            "% White alone", "% Black alone", "% Asian alone", 
                            "% Hispanic",
                            "% Veteran", "% Below poverty", 
                            "% Unemployed", "% Receiving SSI/TANF/SNAP", 
                            "Median household income", "Median rent burden",
                            "% Units renter-occupied", "% Units vacant",
                            "% Census tracts in metropolitan area",
                            "# Affordable per 100, without assistance", 
                            "# Affordable per 100, with assistance",
                            "# Community organizations per 1,000", 
                            "# Human services organizations per 1,000", 
                            "% Republican vote, 2016", 
                            "% with local-level CoC",
                            "% Midwest", "% Northeast", "% South", "% West"))

# Print cluster 
print(xtable::xtable(cluster_character_labeled %>%
                       dplyr::select(Variable_label, starts_with("C")), 
                     digits = 0, align = "lrrrrrrr", 
                     caption = "Characteristics of preference system clusters"), 
      comment = F, table.placement = "!htpb",
      include.rownames = FALSE)

# Print medoids
med_ref <- data.frame(cluster = 1:length(cluster_medoids_original), 
                      medoid = cluster_medoids_original) %>%
  left_join(cluster_map, by = c("cluster" = "old_cluster")) %>%
  dplyr::select(-cluster) %>%
  rename(cluster = new_cluster) %>%
  mutate(cluster_name = paste0("C", cluster)) %>%
  dplyr::select(-cluster) %>%
  spread(key = cluster_name, value = medoid) %>%
  mutate(variable = "medoid") %>%
  dplyr::select(variable, paste0("C", 1:set_k))

print(xtable::xtable(med_ref, 
                     digits = 0, align = "lrrrrrrr", 
                     caption = "Medoids"), comment = F, 
      table.placement = "!htpb",
      include.rownames = FALSE)

# Helper function that prints out summaries of preferences
# given PHA code 
get_prefs <- function(set_pha_code, codes) {
  codes_sub <- codes %>%
    filter(pha_code == set_pha_code) 
  # Subset to nonzero columns 
  code_cols <- codes_sub[,-c(1,2)]
  i <- (colSums(code_cols) != 0) 
  matnonzero <- code_cols[, i]
  out <- bind_cols(data.frame(order = codes_sub$order),
                   matnonzero)
  return(out)
}

# Produce summary that is basis of Table A2. Cluster Medoids
# Print out preferences in medoids 
lapply(med_ref[,-1],
       FUN = get_prefs, coded_prefs)

# Store clustering results for later mapping
saveRDS(transmute(pref_pr, pha_code, cluster_num = new_cluster), 
        here("data", "intermediate", "cluster_assignments.RDS"))


